Internet Publisher's Toolbox 2.0

home *** CD-ROM | disk | FTP | other *** search

/ Internet Publisher's Toolbox 2.0 / Internet Publisher's Toolbox.iso / internet / ntserver / wtsource / irinv.c < prev next >

Wrap

C/C++ Source or Header | 1994-11-14 | 41.0 KB | 1,198 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* Copyright (c) CNIDR (see ../COPYRIGHT) */ /* Change log: * $Log: irinv.c,v $ * Revision 1.3 1993/09/23 03:00:07 pfeifer * undid swap. * Fixed bug with string compare (strcmp) for ISO-Chars * * Revision 1.2 1993/09/23 02:12:20 pfeifer * swaped from_version and into_version merge_index_file * * Revision 1.1 1993/02/16 15:05:35 freewais * Initial revision * * Revision 1.28 92/04/28 16:55:50 morris * added boolean to serial engine * * Revision 1.27 92/03/19 09:34:26 morris * fixed the dictionary header to accurately indicate the number of blocks * * Revision 1.26 92/03/01 16:11:16 brewster * took out the analyze_hashtable_distribution * * Revision 1.25 92/02/12 13:27:31 jonathan * Added "$Log: irinv.c,v $ * Revision 1.3 1993/09/23 03:00:07 pfeifer * undid swap. * Fixed bug with string compare (strcmp) for ISO-Chars * * Revision 1.2 1993/09/23 02:12:20 pfeifer * swaped from_version and into_version merge_index_file * * Revision 1.1 1993/02/16 15:05:35 freewais * Initial revision * * Revision 1.28 92/04/28 16:55:50 morris * added boolean to serial engine * * Revision 1.27 92/03/19 09:34:26 morris * fixed the dictionary header to accurately indicate the number of blocks * * Revision 1.26 92/03/01 16:11:16 brewster * took out the analyze_hashtable_distribution * " so RCS will put the log message in the header * */ /* Inverted file accessor functions. Main functions: finished_add_word */ #include <string.h> /* for memset() */ #include "cutil.h" #include "futil.h" #include "irhash.h" #include "hash.h" #include "panic.h" #include "irfiles.h" #include "irext.h" /* ================== */ /* === Index file === */ /* ================== */ /*This is an implementation of the inverted file itself. * An index_block_number is the position in the file that the * entry starts. * Index block 0 is the null pointer. * The header contains the number of different words in the file. * */ /* the format of an index block is: * in the first byte: * INDEX_BLOCK_FULL_FLAG * || INDEX_BLOCK_NOT_FULL_FLAG * || INDEX_BLOCK_DICTIONARY_FLAG * if index_block_full_flag * next_index_block * index_block_size * stuff * (the number of valid entries is index_block_size - index_block_header_size) * if index_block_not_full_flag * number_of_valid_entries * index_block_size * stuff * if index_block_dictionary_flag * last_index_block NEXT_INDEX_BLOCK_SIZE (0 when this is the last) * index_block_size INDEX_BLOCK_SIZE_SIZE * number_of_occurances NUMBER_OF_OCCURANCES_SIZE * word (followed by \n) * * * "Stuff" is a list of word occurances of the format: * (this is written in irhash.c and read in sersrch.c) * doc_id * character_position * weight * doc_id * character_position * weight * etc. * * It should be (probably in release 9): * doc_id (with high bit set) * weight * character_position * ... * character_position * doc_id * weight * character_position * etc. * */ long write_dictionary_index_block(number_of_occurances,word,stream) long number_of_occurances; char *word; FILE *stream; /* returns a pointer to the index block allocated */ { /* this assumes that the stream is positioned at the end */ long before_length = ftell(stream); /* file_length(stream); */ long index_block_size = INDEX_BLOCK_FLAG_SIZE + NEXT_INDEX_BLOCK_SIZE + INDEX_BLOCK_SIZE_SIZE + NUMBER_OF_OCCURANCES_SIZE + strlen(word) + strlen("\n"); /* this is done this way for PC's (necessary?) */ /* printf("writing dict entry to position %ld\n", before_length); */ /* grow the file */ /* in this implementation, the file is always filled by the fwrite, * so it will grow itself. * grow_file(stream, before_length + how_large); * file length leaves the stream at the end, so we are all set. * s_fseek(stream, before_length, SEEK_SET); */ write_bytes(INDEX_BLOCK_DICTIONARY_FLAG, INDEX_BLOCK_FLAG_SIZE, stream); write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, stream); write_bytes(index_block_size, INDEX_BLOCK_SIZE_SIZE, stream); write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, stream); if(strlen(word) > MAX_WORD_LENGTH) /* for debugging */ waislog(WLOG_HIGH, WLOG_ERROR, "Word: '%s' is too long (%ld characters) for some reason.", word, strlen(word)); fprintf(stream, "%s\n", word); /* just to check { long after_length = ftell(stream); if(after_length - before_length != index_block_size){ waislog(WLOG_HIGH, WLOG_ERROR, "dictionary entry size is %ld, and we thought %ld", after_length - before_length, index_block_size); } } */ return(before_length); } #ifdef everneeded static long modify_dictionary_index_block _AP((long index_block,long last_index_block,long number_of_occurances, FILE* stream)); static long modify_dictionary_index_block(index_block,last_index_block,number_of_occurances,stream) long index_block; long last_index_block; long number_of_occurances; FILE* stream; /* this does not allow one to change the word itself, only the entries around it. It will panic if the index_block is not a valid block. This returns the the stream to pointing at the end of the file. */ { s_fseek(stream, index_block, SEEK_SET); if(INDEX_BLOCK_DICTIONARY_FLAG != read_bytes(INDEX_BLOCK_FLAG_SIZE, stream)) panic("the index block %ld is not a legal dictionary block", index_block); write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, stream); write_bytes(last_index_block, NEXT_INDEX_BLOCK_SIZE, stream); read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); /* ignore it */ write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, stream); s_fseek(stream, 0L, SEEK_END); } #endif /* def everneeded */ static long next_dictionary_index_block _AP((long* index_block_size,long* number_of_occurances,char* word, FILE* stream)); static long next_dictionary_index_block(index_block_size,number_of_occurances,word,stream) long *index_block_size; long *number_of_occurances; char *word; FILE *stream; { /* this read the dictionary index block from the index stream. It assumes the stream is positioned at the start of a dictionary block, and will return non-0 if it is not. returns 0 if it succeeds. returns -1 if it is at the of a file. returns -2 if it read something strange. Always sets word length to 0 if it fails. */ long index_block_flag; char temp[MAX_WORD_LENGTH + 2]; word[0] = '\0'; index_block_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream); if(index_block_flag == EOF){ /* not an error, it is the way it knows it is done waislog(WLOG_HIGH, WLOG_ERROR, "Hit the end of the inverted file too early"); */ return(-1); } if(index_block_flag != INDEX_BLOCK_DICTIONARY_FLAG){ waislog(WLOG_HIGH, WLOG_ERROR, "Did not find a dictionary entry, flag is %ld at file position %ld", index_block_flag, ftell(stream) - INDEX_BLOCK_FLAG_SIZE); return(-2); } (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, stream); *index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); *number_of_occurances = read_bytes(NUMBER_OF_OCCURANCES_SIZE, stream); fgets(temp, MAX_WORD_LENGTH + 2, stream); /* 2 is for the \n and '\0' */ /* trim the \n */ if(temp[strlen(temp) - 1] == '\n'){ temp[strlen(temp) - 1] = '\0'; } strcpy(word, temp); /* printf("Merging word '%s'\n", word); */ return(0); } long read_dictionary_index_block(index_block, last_index_block, index_block_size, number_of_occurances, word, stream) long index_block; long *last_index_block; long *index_block_size; long *number_of_occurances; char *word; FILE *stream; { /* this read the dictionary index block from the index stream. returns 0 if it succeeds. */ long answer; s_fseek(stream, index_block, SEEK_SET); if(0 > (answer = next_dictionary_index_block(index_block_size, number_of_occurances, word, stream))) panic("read dictionary block %ld failed", index_block); *last_index_block = 0; return(answer); } long allocate_index_block(how_large,stream) long how_large; FILE* stream; { /* This returns a pointer for an index block. It creates the space and writes the header. how_large includes the header. Returns the block_number (the byte address of first byte in the block */ /* this assumes that the stream is positioned at the end */ long before_length = ftell(stream); /* file_length(stream); */ /* grow the file */ /* in this implementation, the file is always filled by the fwrite, * so it will grow itself. * grow_file(stream, before_length + how_large); * file length leaves the stream at the end, so we are all set. * s_fseek(stream, before_length, SEEK_SET); */ write_bytes(INDEX_BLOCK_FULL_FLAG, /* in this version all are full */ INDEX_BLOCK_FLAG_SIZE, stream); write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, stream); write_bytes(how_large, INDEX_BLOCK_SIZE_SIZE, stream); return(before_length); } #ifdef testing static void scan_index_blocks _AP((char* filename,boolean verbose)); static void scan_index_blocks(filename,verbose) char *filename; boolean verbose; { /* this is a debugging routine for checking the inverted index file */ long current_index_block = INDEX_HEADER_SIZE; FILE *stream = s_fopen(filename, "rb"); long length = file_length(stream); s_fseek(stream, current_index_block, SEEK_SET); printf("File length %ld\n", length); while(current_index_block < length){ long flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream); long next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream); long index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream); if(flag == INDEX_BLOCK_DICTIONARY_FLAG){ long last_index_block; long index_block_size; long number_of_occurances; char word[MAX_WORD_LENGTH + 1]; if(0 > read_dictionary_index_block(current_index_block, &last_index_block, &index_block_size, &number_of_occurances, word, stream)) panic("read dictionary index block failed"); if(verbose) printf("%5ld: size %3ld Dictionary entry '%s', number of occurances %ld last block %ld\n", current_index_block, index_block_size, word, number_of_occurances, next_index_block); } else if(flag == INDEX_BLOCK_NOT_FULL_FLAG){ if(verbose) printf("%5ld: size %3ld Not full, valid entries %ld\n", current_index_block, index_block_size, next_index_block); } else if(flag == INDEX_BLOCK_FULL_FLAG){ if(verbose) printf("%5ld: size %3ld full block, next block %ld\n", current_index_block, index_block_size, next_index_block); } else panic("bad entry %ld (ftell %ld), flag was %ld", current_index_block, ftell(stream), flag); current_index_block += index_block_size; s_fseek(stream, current_index_block, SEEK_SET); } s_fclose(stream); } #endif /* def testing */ #define COPY_BLOCK_BUFFER_LENGTH 1000 static long copy_stream _AP((FILE* from_stream,FILE* to_stream,long n)); static long copy_stream(from_stream,to_stream,n) FILE *from_stream; FILE *to_stream; long n; { char buffer[COPY_BLOCK_BUFFER_LENGTH]; while(n > 0){ /* keep reading until we are done */ long amount_read = fread(buffer, sizeof(char), MIN(n, COPY_BLOCK_BUFFER_LENGTH), from_stream); if(amount_read == 0 || amount_read == EOF) return(-1); #ifdef WIN32 if(amount_read != (long)fwrite(buffer, sizeof(char), amount_read, to_stream)) #else if(amount_read != fwrite(buffer, sizeof(char), amount_read, to_stream)) #endif return(-1); n -= amount_read; } return(0); } static void merge_blocks _AP((char* word,FILE* from_stream_1, FILE* from_stream_2,FILE* to_stream)); static void merge_blocks(word,from_stream_1,from_stream_2,to_stream) char* word; FILE *from_stream_1; FILE *from_stream_2; FILE *to_stream; /* puts from_stream_1 first into to_stream then from_stream_2*/ { long flag; long index_block_size; long other_block_size; flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_1); (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_1); index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_1); if(flag == EOF) return; if(flag != INDEX_BLOCK_FULL_FLAG) panic("the next index block is not a full block"); flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_2); (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_2); other_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_2); if(flag != INDEX_BLOCK_FULL_FLAG) panic("the next index block is not a full block"); write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream); write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, to_stream); if((index_block_size + other_block_size) > (1L << (INDEX_BLOCK_SIZE_SIZE * 8))){ /* then the block is too large to be represented in the index_block_size_size. This routine takes the radical step to eliminate it completely. The "right" thing to do is to chain blocks, but I dont think it is worth it since this should not happen very often. If it does happen often then bump up INDEX_BLOCK_SIZE_SIZE. */ waislog(WLOG_LOW, WLOG_INFO, "Can not merge the index block for %s, since it would create one that is too big. Deleting it.",word); write_bytes(INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream); s_fseek(from_stream_1, index_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR); s_fseek(from_stream_2, other_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR); } else{ /* copy away */ write_bytes(index_block_size + other_block_size - INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream); copy_stream(from_stream_1, to_stream, index_block_size - INDEX_BLOCK_HEADER_SIZE); copy_stream(from_stream_2, to_stream, other_block_size - INDEX_BLOCK_HEADER_SIZE); } } static void make_dummy_block _AP((FILE* from_stream_1,FILE* from_stream_2, FILE* to_stream)); static void make_dummy_block(from_stream_1,from_stream_2,to_stream) FILE *from_stream_1; FILE *from_stream_2; FILE *to_stream; /* deletes block from_stream_1 and from_stream_2 and makes a 0 length block on to_stream */ { long flag; long index_block_size; long other_block_size; flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_1); (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_1); index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_1); if(flag == EOF) return; if(flag != INDEX_BLOCK_FULL_FLAG) panic("the next index block is not a full block"); flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream_2); (void)read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream_2); other_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream_2); if(flag != INDEX_BLOCK_FULL_FLAG) panic("the next index block is not a full block"); write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream); write_bytes(0L, NEXT_INDEX_BLOCK_SIZE, to_stream); write_bytes(INDEX_BLOCK_HEADER_SIZE, INDEX_BLOCK_SIZE_SIZE, to_stream); s_fseek(from_stream_1, index_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR); s_fseek(from_stream_2, other_block_size - INDEX_BLOCK_HEADER_SIZE, SEEK_CUR); } static void copy_block _AP((FILE* from_stream,FILE* to_stream)); static void copy_block(from_stream,to_stream) FILE *from_stream; FILE *to_stream; /* copies an index block from one stream to another */ { /* copies an index block from from_stream to to_stream */ long flag; long next_index_block; long index_block_size; flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, from_stream); next_index_block = read_bytes(NEXT_INDEX_BLOCK_SIZE, from_stream); index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, from_stream); if(flag == EOF) return; write_bytes(flag, INDEX_BLOCK_FLAG_SIZE, to_stream); write_bytes(next_index_block, NEXT_INDEX_BLOCK_SIZE, to_stream); write_bytes(index_block_size, INDEX_BLOCK_SIZE_SIZE, to_stream); /* copy the real stuff */ copy_stream(from_stream, to_stream, index_block_size - INDEX_BLOCK_HEADER_SIZE); } /* ================== */ /* === Merge File === */ /* ================== */ static boolean merge_index_file _AP((long from_version, long into_version, database* db, boolean generate_dictionary)); /* Merges a index file (from_version) into another (into_version). Returns true if successful */ static boolean merge_index_file(into_version, from_version, db,generate_dictionary) long into_version; long from_version; database* db; boolean generate_dictionary; /* This is done by merging both into version -1 and then, renames it to into_version, then deletes from_version. */ { char input_filename_1[MAX_FILENAME_LEN]; /* into_version file */ char input_filename_2[MAX_FILENAME_LEN]; /* from_version file */ char output_filename[MAX_FILENAME_LEN]; /* version -1 */ FILE *input_stream_1; FILE *input_stream_2; FILE *output_stream; char input_word_1[MAX_WORD_LENGTH + 1]; char input_word_2[MAX_WORD_LENGTH + 1]; if(generate_dictionary) { if(into_version == -2) waislog(WLOG_MEDIUM, WLOG_INDEX, "Merging version %ld into master version and generating dictionary.", from_version); else waislog(WLOG_MEDIUM, WLOG_INDEX, "Merging version %ld into %ld and generating dictionary.", from_version, into_version); } else { waislog(WLOG_MEDIUM, WLOG_INDEX, "Merging version %ld and %ld", into_version, from_version); } index_filename_with_version(into_version, input_filename_1, db); index_filename_with_version(from_version, input_filename_2, db); index_filename_with_version(-1L, output_filename, db); if(NULL == (input_stream_1 = s_fopen(input_filename_1, "rb"))) return(false); if(NULL == (input_stream_2 = s_fopen(input_filename_2, "rb"))) return(false); if(NULL == (output_stream = s_fopen(output_filename, "wb"))) return(false); { long index_block_size_1; long number_of_occurances_1; long index_block_size_2; long number_of_occurances_2; /* leave room for the number_of_words in the output */ s_fseek(output_stream, INDEX_HEADER_SIZE, SEEK_SET); /* read the number of words in the 2 input files, this is the maximum number of words that can be in the dictionary, It is used by init_dict_file to set asside space for the dictionary_header_block. It is very likley that some of the words are duplicates, so the actual size of the header_block will be smaller. But we never know until we do the merge, so we have to give enough space just in case. NOTE - in the case where generate_dictioary is false, we still need to do this because we need to skip the header_size in each of the input streams */ db->number_of_words = read_bytes(INDEX_HEADER_SIZE,input_stream_1) + read_bytes(INDEX_HEADER_SIZE,input_stream_2); if (generate_dictionary) /* allocate the header block if necessary */ { init_dict_file_for_writing(db); } /* now that the dictioanry_header_block is allocated, we can start counting for real */ db->number_of_words = 0; if(-1 > next_dictionary_index_block(&index_block_size_1, &number_of_occurances_1, input_word_1, input_stream_1)) panic("Read of dictionary block failed in file %s", input_filename_1); if(-1 > next_dictionary_index_block(&index_block_size_2, &number_of_occurances_2, input_word_2, input_stream_2)) panic("Read of dictionary block failed in file %s", input_filename_2); while(true){ long strlen_1 = strlen(input_word_1); long strlen_2 = strlen(input_word_2); long compare = strcmp(input_word_1, input_word_2); /* +1 if 1 is bigger */ if (!strlen_1) { /* Make it work for iso chars */ if (!strlen_2) compare = 0; else compare = -1; } if((0 == strlen_1) && (0 == strlen_2)){ /* printf("Done with merge\n"); */ /* then we are done. */ break; } else if((0 != strlen_1) && (0 != strlen_2) && (0 == compare)){ /* they are equal */ /* printf("Merging word %s and %s\n", input_word_1, input_word_2); */ write_dictionary_index_block(number_of_occurances_1 + number_of_occurances_2, input_word_1, output_stream); if(generate_dictionary){ add_word_to_dictionary(input_word_1, ftell(output_stream), number_of_occurances_1 + number_of_occurances_2, db); } db->number_of_words++; /* copy file 1 first. this assumes that file 1 was indexed before file 2 */ if((number_of_occurances_1+number_of_occurances_2) > MAX_OCCURANCES){ /* too many already, just make a dummy (0 length) */ if((number_of_occurances_1 < MAX_OCCURANCES) && (number_of_occurances_2 < MAX_OCCURANCES)) { /* only print the first time */ waislog(WLOG_MEDIUM, WLOG_INDEX, "Deleting word '%s' since it has %ld occurences (limit %ld).", input_word_1,number_of_occurances_1+number_of_occurances_2, MAX_OCCURANCES); } make_dummy_block(input_stream_1, input_stream_2, output_stream); } else{ merge_blocks(input_word_1,input_stream_1,input_stream_2, output_stream); } if(-1 > next_dictionary_index_block(&index_block_size_1, &number_of_occurances_1, input_word_1, input_stream_1)) panic("Read of dictionary block failed in file %s", input_filename_1); if(-1 > next_dictionary_index_block(&index_block_size_2, &number_of_occurances_2, input_word_2, input_stream_2)) panic("Read of dictionary block failed in file %s", input_filename_2); } else if(((0 == strlen_1) && (0 != strlen_2) && (0 > compare)) || ((0 != strlen_1) && (0 != strlen_2) && (0 < compare))){ /* write from block 2 */ /* printf("From block 2: writing word '%s' not '%s'\n", input_word_2, input_word_1); */ write_dictionary_index_block(number_of_occurances_2, input_word_2, output_stream); if(generate_dictionary){ add_word_to_dictionary(input_word_2, ftell(output_stream), number_of_occurances_2, db); } db->number_of_words++; copy_block(input_stream_2, output_stream); if(-1 > next_dictionary_index_block(&index_block_size_2, &number_of_occurances_2, input_word_2, input_stream_2)) panic("Read of dictionary block failed in file %s", input_filename_2); } else{ /* write from block 1 */ /* printf("From block 1: writing word '%s' not '%s'\n", input_word_1, input_word_2); */ write_dictionary_index_block(number_of_occurances_1, input_word_1, output_stream); if(generate_dictionary){ add_word_to_dictionary(input_word_1, ftell(output_stream), number_of_occurances_1, db); } db->number_of_words++; copy_block(input_stream_1, output_stream); if(-1 > next_dictionary_index_block(&index_block_size_1, &number_of_occurances_1, input_word_1, input_stream_1)) panic("Read of dictionary block failed in file %s", input_filename_1); } } } /* write out the number of words */ s_fseek(output_stream, 0L, SEEK_SET); write_bytes(db->number_of_words, INDEX_HEADER_SIZE, output_stream); s_fclose(input_stream_1); s_fclose(input_stream_2); s_fclose(output_stream); /* check resulting file */ /* check_index_file(output_filename); */ /* scan_index_blocks(output_filename); */ /* delete the input files */ remove(input_filename_1); remove(input_filename_2); /* These next two calls result in the renaming of the new files ontop of the old files. This is the only critical time in which if an online update is happening (the index files are being changed while the queries are being answered). If a query comes along in another process and opens the .inv file after the rename is done, but before the finished_add_word_to_dictionary has the change to rename the .dct file and therefore opens the old version of the .dct file, then the files will be out of synch. */ if (0 != rename(output_filename, input_filename_1)) waislog(WLOG_HIGH, WLOG_ERROR, "could not rename file %s to %s", output_filename, input_filename_1); if(generate_dictionary) finished_add_word_to_dictionary(db); #ifdef THINK_C /* set the mac file type to INDX */ setFileType(input_filename_1, WAIS_INDEX_FILE_TYPE, CREATOR); #endif /* THINK_C */ return(true); } /* only works on positive, non-zero numbers, and is slow to boot, yippie */ static long logcount _AP((long number)); static long logcount(number) long number; { long answer = 0; for( ; number > 0; number = number >> 1) answer++; return(answer); } /* * Originally, the index files produced while building an index were * merged together using a normal binary merge sort. For example, if * there were 8 files (0 through 7) to be merged, they were merged * in the following order, after all the indexing was finished: * * 0+1 -> 0 2+3 -> 2 4+5 -> 4 6+7 -> 6 * 0+2 -> 0 4+6 -> 4 * 0+4 -> 0 * * The only problem with this is that it means that all of the index * files have to stay around until the entire database has been * indexed. This means that the amount of disk space needs * temporarily during the indexing grows linearly with the amount of * data being indexed. * * The idea of this routine is to reduce the amount of temporary disk * space used by doing whatever merging it can, DURING the indexing * rather than after it. The routine gets called whenever an index * file has just been flushed, and it figures out what merging needs * to be performed and does it. * * The merging done by this routine ends up being the same as the * merging done by the old routine at the end of indexing. However, * it happens in a different order. For example, once again, if the * total number of index files were 8, this is what would happen (a * number by itself means that it was flushed; number+number->number * refers to a merge): * * 0 1 0+1->0 * 2 3 2+3->2 0+2->0 * 4 5 4+5->4 * 6 7 6+7->6 4+6->4 0+4->0 * * You should be able to convince yourself relatively quickly that the * result of this is that the amount of space needed temporarily * during indexing grows logarithmically with the amount of data being * indexed, rather than linearly. What's more, no extra time is required. * * There is one complication. When the total number of files being * indexed is an even power of two, the pattern of merging is regular, * as demonstrated above. However, if the total number of files is * not an even power of two, things are a bit complicated. For * example, if there were 11 files instead of 8, we'd have to add this * to the end of the list given above: * * 8 9 8+9->8 * 10 8+10->8 0+8->0 * * The algorithm for determining what merges to perform after every * second index file, when the index file just flushed is not the last * one, is simple. However, the algorithm for determining what to do * on the last file flushed, whether or not it's a power of two, is a * bit more complicated. * * The first algorithm: * * splitter := the number of the file just indexed (starting from 1) * if splitter is odd * then return * low = splitter - 2 // counting index files from 0 * high = splitter - 1 // ditto * twotothe = 1 * while splitter is even do * merge files low and high into low * high := high - twotothe * twotothe := twotothe * 2 * low := low - twotothe * splitter := splitter / 2 * done * * The translation from this into the code below should be somewhat * obvious. What is NOT obvious, however, is all the stuff with * "target", "realhigh" and "where_to_stop", and with the extra loop * wrapped around the code above. The point of all that is to handle * the second case mentioned above, when we're done indexing and need * to merge everything together. It works by pretending doing a * series of merges of the first type described above, from the actual * last index file until the next power of two higher than it. At * each pretended series, only those merges for which the low and high * index files actually still exist are performed. * * In the degenerate case of the second algorithm, when the last index * file being flushed is an even power of two, the outer loop only * runs once and the behavior is the same as in the first algorithm * described above. * * I hope that explains things adequately :-). */ #ifndef END_MERGE static void do_intermediate_merging _AP((database *db, boolean completely)); static void do_intermediate_merging(db, completely) database *db; boolean completely; { int last = db->index_file_number - 1; int twotothe, low, high, where_to_stop, target, realhigh = last, splitter; char filename[MAX_FILENAME_LEN]; char filename2[MAX_FILENAME_LEN]; if (last % 2 == 0) { if (! completely) { return; } else { for (where_to_stop = 1; where_to_stop < last + 1; where_to_stop *= 2) /* empty */; } } else { if (completely) { for (where_to_stop = 1; where_to_stop < last + 1; where_to_stop *= 2) /* empty */; } else { where_to_stop = last + 1; } } for (target = ((last % 2) == 0) ? last + 2 : last + 1; target <= where_to_stop; target += 2) { low = target - 2; high = target - 1; twotothe = 1; splitter = target; while (((splitter % 2) == 0) && (low >= 0)) { if (low < realhigh) { if (! merge_index_file(low, (high > realhigh) ? realhigh : high, db, completely && (! low) && !probe_file(index_filename(filename, db)))) { panic("Error in merging file %ld into %ld", low, (high > last) ? last : high); } realhigh = low; } high -= twotothe; twotothe *= 2; low -= twotothe; splitter /= 2; } } if (completely) { if (probe_file(index_filename(filename, db))) { merge_index_file(-2L, 0L, db, true); } else if (last == 0) { touch_file(index_filename_with_version(1, filename, db)); merge_index_file(0L, 1, db, true); /* rename 0 into the final name */ index_filename_with_version(0L, filename2, db); index_filename(filename, db); if (0!=rename(filename2,filename)) waislog(WLOG_HIGH, WLOG_ERROR, "could not rename file %s to %s", filename2, filename); } } } #endif /* #ifndef END_MERGE */ static void merge_index_files _AP((database* db)); static void merge_index_files(db) database *db; /* this merges all the temporary inverted files into a large on * and creates a dictionary. * This is done in a logrithmic merge of the files. * An n-ary merge would be faster of course, but what the heck... */ { /* this version does it two at a time */ char filename[MAX_FILENAME_LEN]; char filename2[MAX_FILENAME_LEN]; long level; long final_level; long number_of_files_to_merge = db->index_file_number; /* at level 0, merge 0,1->0; 2,3->2; 4,5->4; 6,7->6; at level 1, merge 0,2->0; 4,6->4; at level 2, merge 0,4->0; stop. If there is only 1 file, then dont merge (final_level will be -1), the dictionary will be generated by merge after the for loop. */ final_level = logcount(number_of_files_to_merge - 1) - 1; for(level = 0; level <= final_level; level++){ long into_version; for(into_version = 0; into_version < number_of_files_to_merge; into_version += (2 << level)){ long from_version = into_version + (1 << level); if(from_version < number_of_files_to_merge){ boolean generate_dictionary; /* if this is the final level and there is no .inv file then yes */ if(level == final_level){ if(probe_file(index_filename(filename, db))) generate_dictionary = false; else generate_dictionary = true; } else{ generate_dictionary = false; } /* printf("Level %d (out of %d) merging into %d from %d\n", level, final_level, into_version, from_version); */ if(false == merge_index_file(into_version, from_version, db, generate_dictionary)) panic("Error in merging file %ld into %ld", from_version, into_version); } } } /* done merging */ if(probe_file(index_filename(filename, db))){ /* if there is a .inv file, then we are adding to an existing db. merge */ /* do this by merging straight into the final using the special -2 version for the .inv file */ merge_index_file(-2L, 0L, db, true); } else if(number_of_files_to_merge == 1){ /* then we have to generate a dictionary for this one file. create a dummy file in 1, then merge that while making a dictionary */ touch_file(index_filename_with_version(1, filename, db)); merge_index_file(0L, 1, db, true); /* rename 0 into the final name */ index_filename_with_version(0L,filename2, db); index_filename(filename, db); if (0!=rename(filename2,filename)) waislog(WLOG_HIGH, WLOG_ERROR, "could not rename file %s to %s", filename2, filename); } } /* ============================================ */ /* === Flushing the memory version of the === */ /* === word hashtable to disk files === */ /* ============================================ */ static void flush_word_entry_to_file _AP((hash_entry* wrd_entry, database* db)); static void flush_word_entry_to_file(wrd_entry,db) hash_entry* wrd_entry; database* db; { /* In this version, each word_entry is made into an index block. This means that there may be many small index blocks if the memory can not hold many files worth of data. This approach was taken for simplicities sake and it might be the best approach if there is a small vocabulary. This assumes that the index stream is positioned at the end. */ if(wrd_entry->number_of_occurances > MAX_OCCURANCES){ /* there are too many of this word, do nothing. * this may result in some having been written before, or * no occurances of this word might be recorded. Is this right? * if the first MAX_OCCURANCES want to be recored, then * add this clause to this condition: * && wrd_entry->starting_block_number != 0 */ return; } if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr + INDEX_BLOCK_HEADER_SIZE) >= (1L << (8 * INDEX_BLOCK_SIZE_SIZE))) return; /* there are too many entries to store away, therefore throw it all away */ if(NULL == wrd_entry->memory_ptr){ /* not entries to write out */ return; } if(0 == (wrd_entry->current_memory_ptr - wrd_entry->memory_ptr)){ return; /* there are no word entries to write */ } /* printf("Flushing word %s\n", wrd_entry->key); */ /* if this is the entry for this word, the put in the dictionary entry in the index file */ write_dictionary_index_block(wrd_entry->number_of_occurances, wrd_entry->key, db->index_stream); db->number_of_words++; /* allocate and write the new block */ allocate_index_block(wrd_entry->current_memory_ptr - wrd_entry->memory_ptr + INDEX_BLOCK_HEADER_SIZE, db->index_stream); /* cprintf(PRINT_AS_INDEXING, "New block number: %ld\n", new_block); */ #ifdef WIN32 if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr) != (long)fwrite(wrd_entry->memory_ptr, 1L, (wrd_entry->current_memory_ptr - wrd_entry->memory_ptr), db->index_stream)) #else if((wrd_entry->current_memory_ptr - wrd_entry->memory_ptr) != fwrite(wrd_entry->memory_ptr, 1L, (wrd_entry->current_memory_ptr - wrd_entry->memory_ptr), db->index_stream)) #endif panic("Write failed"); /* free the memory for the block written out */ free_word_occurance_block(wrd_entry->memory_ptr); wrd_entry->memory_ptr = NULL; wrd_entry->current_memory_ptr = NULL; wrd_entry->memory_size = 0; wrd_entry->current_doc_id = 0; } /* This flushes all of the memory version of the word hashtable to disk. * this is called when the hashtable is filling up or we have * accumulated enough words. If completely is true, then it will merge * all the intermediate files. */ void flush_memory_hashtable_to_disk(db,completely) database* db; boolean completely; { /* map over the memory word hashtable and write the entries to disk */ long i; char filename[1000]; /* analyze the hash distribution analyze_hashtable_distribution(db->the_word_memory_hashtable); */ #ifdef WIN32 /* Close the index file if it is open */ if (db->index_stream!=NULL) { s_fclose(db->index_stream); } #endif db->index_stream = s_fopen(index_filename_with_version(db->index_file_number++, filename, db), "wb"); if(NULL == db->index_stream) panic("Could not open file %s to insert index", index_filename_with_version(db->index_file_number, filename, db)); waislog(WLOG_MEDIUM, WLOG_INDEX, "Flushing %ld different words, %ld total words to disk...", hashtable_count(db->the_word_memory_hashtable), db->number_of_words_in_hashtable); db->number_of_words = 0; write_bytes(0l, INDEX_HEADER_SIZE, db->index_stream); sort_hashtable(db->the_word_memory_hashtable); for(i = 0; i < hashtable_count(db->the_word_memory_hashtable); i++){ flush_word_entry_to_file(&db->the_word_memory_hashtable->contents[i], db); } /* write out the number of entries into the index_file header */ s_fseek(db->index_stream, 0L, SEEK_SET); write_bytes(db->number_of_words, INDEX_HEADER_SIZE, db->index_stream); s_fclose(db->index_stream); /* since everything is written out, we can flush these. */ flush_word_occurance_buffers(); clr_hashtable(db->the_word_memory_hashtable); db->number_of_words_in_hashtable = 0; /* add the stopwords to the index for the next session. */ add_stop_words(db->the_word_memory_hashtable); waislog(WLOG_MEDIUM, WLOG_INDEX, "Done flushing version %ld", db->index_file_number - 1); /* scan_index_blocks(filename); */ #ifdef END_MERGE if(completely){ waislog(WLOG_MEDIUM, WLOG_INDEX, "Merging %ld files", db->index_file_number); merge_index_files(db); waislog(WLOG_MEDIUM, WLOG_INDEX, "Done merging."); } #else do_intermediate_merging(db, completely); #endif /* #ifdef END_MERGE */ } long init_add_word(db, hashtable_size, flush_after_n_words) database *db; long hashtable_size; long flush_after_n_words; { char fn[256]; if(NULL != db->the_word_memory_hashtable) free_hashtable(db->the_word_memory_hashtable); db->the_word_memory_hashtable = make_hashtable(0, hashtable_size, sizeof(hash_entry)); db->flush_after_n_words = flush_after_n_words; add_stop_words(db->the_word_memory_hashtable); strcpy( fn,db->database_file ); strcat( fn,synonym_ext ); syn_ReadFile( fn,&db->syn_Table,&db->syn_Table_Size ); return(0); } long finished_add_word(db) database *db; { flush_memory_hashtable_to_disk(db,true); syn_Free( db->syn_Table,&db->syn_Table_Size ); return(0); /* successful */ }